Load R libraries

library(tidyverse)
library(readxl)
library(janitor)
library(lubridate)
library(data.table)
library(tidytext)
library(rvest)
library(httr)
library(wdman)
library(RSelenium)  # provides remoteDriver(), used below but previously never loaded

Scrape NPR.org

NPR disables automated access, so spin up a Selenium server to build a list of story headlines and links from NPR’s Climate archive going back to Oct. 1, 2022.

#Start Selenium server
# NOTE(review): chromever is pinned even though the remote session below
# requests Firefox — the chromedriver download is likely unnecessary; confirm.
selServ <- selenium(
  port = 4444L,
  version = 'latest',
  chromever = '103.0.5060.134'
)

remDr <- remoteDriver(
  remoteServerAddr = 'localhost',
  port = 4444L,
  browserName = 'firefox'
)

#Give the server a moment to come up before connecting
Sys.sleep(5)

#Open browser
remDr$open()

#Go to NPR's climate archive, starting from today's date (mm-dd-yyyy)
archive_url <- paste0(
  "https://www.npr.org/sections/climate/archive?date=",
  format(Sys.Date(), format = "%m-%d-%Y")
)
remDr$navigate(archive_url)
Sys.sleep(5)

#Grab HTML source code
source <- remDr$getPageSource()[[1]]

#Page is dynamically rendered as a user scrolls, so scroll down page until September's stories load
# NOTE(review): this loops forever if the 2022-09-29 datetime never appears
# (e.g., NPR changes the page markup) — consider a max-iteration guard.
while (!str_detect(source, "datetime=\"2022-09-29\"")){
  webElem <- remDr$findElement("css", "body")

  # Send one down-arrow keypress, wait for lazy-loaded stories, then re-read the DOM
  webElem$sendKeysToElement(list(key = "down_arrow"))
  Sys.sleep(1)
  source <- remDr$getPageSource()[[1]]
}

#Parse the fully scrolled page source into an xml document
page <- read_html(source)

#Collect the URL of every story since October (headline links live in <h2><a>)
links <- page %>%
  html_nodes("h2 a") %>%
  html_attr("href") %>%
  as.data.frame() %>%
  magrittr::set_colnames("url")

#Collect the headline text of every story since October
headline <- page %>%
  html_nodes("h2 a") %>%
  html_text() %>%
  as.data.frame() %>%
  magrittr::set_colnames("headline")

#Calculate date published from the URL path (format: yyyy/mm/dd/...).
#Section-front URLs carry a "sections/<name>/" prefix; a single generic
#pattern strips any section rather than the original three hard-coded ones
#(goatsandsoda, pictureshow, health-shots), so fewer dates fail to parse.
#Short Wave episode URLs are named differently, so a couple still return NA.
date <- links %>%
  mutate(date = str_remove(url, "^https://www\\.npr\\.org/(sections/[^/]+/)?")) %>%
  mutate(date = ymd(substr(date, 1, 10)))

#Check for URLs whose date failed to parse (ymd() returned NA).
#The original filter used !str_detect(date, "[0-9]"), but str_detect() on an
#NA date returns NA, which filter() silently drops — so parse failures were
#never displayed. is.na() is the direct test.
date %>%
  filter(is.na(date))

#Build the scrape list: join dates to headlines, keeping only stories
#published Oct. 1, 2022 or later
df <- cbind(date, headline) %>%
  filter(date > as.Date("2022-09-30"))

Check results

#Reload previously scraped results; reorder to date, headline, url
#(positions 2, 3, 1 of the saved CSV)
df <- read_csv("npr_scrape.csv") %>% select (2,3,1)

head (df)
## # A tibble: 6 × 3
##   date       headline                                                      url  
##   <date>     <chr>                                                         <chr>
## 1 2023-04-05 Scientists warn California's floods may be a sample of Megaf… http…
## 2 2023-04-04 There are plenty of doomsday climate stories — 'Extrapolatio… http…
## 3 2023-04-04 Some lawmakers aim to include measures to curb climate chang… http…
## 4 2023-04-03 Farms could help sustain Texas' oyster industry amid climate… http…
## 5 2023-04-03 Activist investors press corporations to take action against… http…
## 6 2023-04-02 Why deforestation means less rain in tropical forests         http…

Scrape story text

This code takes the list of URLs and headlines created above and visits each URL in sequence to scrape the story text, byline and other details. It also removes unnecessary words, such as reporter names, NPR jargon and descriptions of ambient sound, which appear frequently but are not representative of the text corpus.

#Scrape each story page in turn, extracting body text, byline and slug.
#Rows accumulate in a preallocated list and are bound once at the end —
#the original grew `output` with bind_rows() inside the loop (quadratic).
rows <- vector("list", nrow(df))

for (i in seq_len(nrow(df))) {
  link <- df$url[i]

  #Render the page in the live browser session (NPR blocks plain HTTP scrapers)
  remDr$navigate(link)
  Sys.sleep(4)

  source <- remDr$getPageSource()[[1]]
  html <- read_html(source)
  done <- FALSE

  #Pull body of story (DACS auto-transcript version)
  transcript <- html %>%
    html_nodes(xpath = "//*[@class='transcript storytext']") %>%
    html_text()

  if (length(transcript) > 0) {
    html2 <- transcript %>%
      #Remove speaker names and descriptions of ambient sound
      str_remove_all("(\\b[A-Z]+ *\\b[:]*){2,}") %>% #two+ all-caps words, with or without trailing space (names, ambi)
      str_remove_all("[, ]*[A-Z]+:") %>% #words with a colon after them (, BYLINE:, HOST:, etc.)
      str_remove_all("\\(\\)") %>%
      str_remove_all("A MARTÍNEZ") %>%
      str_squish()

    #Keep only the text before the copyright notice
    html2 <- str_split(html2, "Copyright ©")[[1]][[1]]

    done <- TRUE
  }

  #Pull article text if it's not a DACS auto-transcript
  if (!done) {
    html2 <- html %>%
      html_nodes(xpath = "//*[@id='storytext']") %>%
      html_nodes("p") %>%
      html_text() %>%
      #Remove speaker names and descriptions of ambient sound
      str_remove_all("(\\b[A-Z]+ *\\b[:]*){2,}") %>%
      str_remove_all("[, ]*[A-Z]+:") %>%
      str_remove_all("\\(\\)") %>%
      str_remove_all("A MARTÍNEZ") %>%
      str_remove_all("Enlarge this image") %>%
      as.character() %>%
      str_squish() %>%
      toString()

    #Keep only the text before the "More Stories From" footer
    html2 <- str_split(html2, "More Stories From")[[1]][[1]]
  }

  #Stop immediately if nothing was extracted, naming the offending URL
  if (length(html2) < 1) {
    stop("Failed to extract story text for ", link, call. = FALSE)
  }

  #Pull byline
  byline <- html %>%
    html_nodes(xpath = "//*[@id='storybyline']/div/div/div/p") %>%
    html_text() %>%
    str_squish() %>%
    toString()

  #Pull slug (climate, weather, national, etc.)
  slug <- html %>%
    html_node(xpath = "//h3[@class='slug']") %>%
    html_text() %>%
    str_squish() %>%
    toString()

  rows[[i]] <- tibble(byline = byline, body = html2, slug = slug)
}

output <- bind_rows(rows)

#Join scrape results to headline, url and date
join <- cbind(df, output)

#Save file
write_csv(join, "npr_scrape.csv")

Examine results

What desks were these stories assigned on NPR.org?

#Reload the saved scrape results
join <- read_csv("npr_scrape.csv")

#How were these stories categorized on NPR.org?
join %>%
  count(slug) %>%
  arrange(desc(n)) %>%
  print()
## # A tibble: 33 × 2
##    slug                                n
##    <chr>                           <int>
##  1 Climate                           132
##  2 Weather                            22
##  3 National                           16
##  4 Short Wave                         14
##  5 Environment                        13
##  6 Consider This from NPR              7
##  7 Goats and Soda                      7
##  8 The Indicator from Planet Money     7
##  9 Business                            6
## 10 <NA>                                6
## # … with 23 more rows

Take a look at first story to check it scraped properly. The all-caps names of reporters and hosts and ambient sound descriptions have been removed.

#Spot-check the first scraped story: print its slug, headline, URL, byline and body
join %>%
  filter (row_number()==1) %>%
  pull (slug) %>%
  paste ("SLUG:", .) %>%
  print 
## [1] "SLUG: Climate"
join %>%
  filter (row_number()==1) %>%
  pull (headline) %>%
  paste ("HED:", .) %>%
  print 
## [1] "HED: Scientists warn California's floods may be a sample of Megafloods to come"
join %>%
  filter (row_number()==1) %>%
  pull (url) %>%
  paste ("URL:", .) %>%
  print 
## [1] "URL: https://www.npr.org/2023/04/05/1168116210/scientists-warn-californias-floods-may-be-a-sample-of-megafloods-to-come"
join %>%
  filter (row_number()==1) %>%
  pull (byline) %>%
  paste ("BYLINE:", .) %>%
  print 
## [1] "BYLINE: Ezra David Romero"
join %>%
  filter (row_number()==1) %>%
  pull (body) %>%
  paste ("BODY:", .) %>%
  print 
## [1] "BODY: California residents are beginning to recover from a string of intense storms this winter that caused levee failures and flooding. Some climate scientists say the wild weather is just a sample of what we can expect in a warmer world. KQED's Ezra David Romero has this report. Antonio Hueso evacuated his home in the early morning hours of March 12. That's when the Pajaro River levee in Monterey County failed. His two-story home is about an hour south of San Francisco. The cameras on his daisy-yellow-colored house caught the water submerging his street and then his first floor. I check my cameras, 8 o' clock - this is a second river. It's the second time the 72-year-old's home has flooded because this levee failed. He's now considering leaving his home of nearly five decades. I'll fix the house, and when the people forget this, I sell my house, and I move to Madera or Fresno. I don't know. UCLA climate scientist Daniel Swain warns what Californians have lived through this winter is only a taste of what's to come as human-caused climate change continues. As disruptive as this year's events have been, we're nowhere near close to a plausible worst case storm and flood scenario for California. Swain is clear about the links between climate change and the increase in extreme flooding. In a study last year, Swain looked at the worst-case scenario - a weekslong parade of extreme atmospheric rivers which California did not have this year. Swain found the warming climate has already doubled the probability of a megaflood. Such catastrophic flooding could create more than $1 trillion in damage. It could happen next year, or it might not happen for 100 years. If this pattern of back-to-back atmospheric rivers sounds familiar, it's because Californians are witnessing an echo of this. Swain says the main differences are that this winter's storms had breaks between them and that none of the storms were considered extreme. 
We see that it is possible to have years where there are multiple atmospheric rivers in a row that are much stronger than what we saw at any point this year. California is taking Swain's predictions seriously. Michael Anderson is a state climatologist. He's trying to convince the state to fund a project that would model severe flooding scenarios, considering climate data, weather forecasting and local conditions. Anderson says this would give the state a heads-up on just how severe a storm pattern could be, what's at risk of flooding and who should evacuate. Unfortunately, Mother Nature kind of beat us to the punch here. But we're working on trying to develop a capability to kind of help us better understand how to recognize when things are scaling up so that you get the right level of response dialed in. And it's a tool we don't have right now. The project could be completed in a year if the state approves it. The planning is already late for people dealing with flooding from failed levees this winter. The squishy sound is still water being here and mud. Denia Escutia woke up to the sound of water trickling into her room hours after the Pajaro levee broke. My feet touched the rug, and the rug was wet. Escutia is 18 and is questioning whether Pajaro can remain home due to the effects of human-caused climate change.What do you think your future will look like if you stay here? My future - I feel like it will look like gone. Gone because the climate the levee was designed for no longer exists.For NPR News, I'm Ezra David Romero."

Analyze results

This code tokenizes the text corpus to find frequency of words and word pairs within 272 stories published from Oct. 1, 2022-April 5, 2023. It does this by removing stop words, which are so common that they don’t provide insight for textual analysis (e.g., “a”, “an”, “the”).

The resulting list displays how frequently almost 29,000 unique word pairs appeared in NPR’s climate coverage over the last six months.

#Tokenize each story body into individual words
join2 <- join %>%
  unnest_tokens(word, body, token = "words")

#Drop stop words and URL fragments from the word list
join_words <- join2 %>%
  anti_join(stop_words, by = "word") %>% #went from 236700 to 107524
  filter(!str_detect(word, "http"),
         !str_detect(word, "t.co")) #removed an additional few words

#Regex matching URLs, HTML entities and retweet markers to strip before
#tokenizing. NOTE: the original wrote "\bRT" with a single backslash, which
#R parses as a literal backspace character, so RT was never matched.
replace_reg <- "https?://[^\\s]+|&amp;|&lt;|&gt;|\\bRT\\b"

#Create list of word pairs (236430 word pairs).
#Tokenize the cleaned `text` column — the original tokenized `body`, so the
#URL-stripping mutate() above had no effect on the bigrams.
join_bigrams <- join %>% 
  mutate(text = str_replace_all(body, replace_reg, "")) %>%
  unnest_tokens(bigram, text, token = "ngrams", n = 2)

#Remove pairs containing stop words
join_bigrams <- join_bigrams %>%
  separate(bigram, into = c("first", "second"), sep = " ", remove = FALSE) %>%
  anti_join(stop_words, by = c("first" = "word")) %>%
  anti_join(stop_words, by = c("second" = "word")) %>% #went from 236430 to 46028
  filter(str_detect(first, "[a-z]") &
         str_detect(second, "[a-z]")) %>% #goes to 43675
  #Escape the dot so the pattern matches the literal domain "t.co" only
  filter(!str_detect(bigram, "t\\.co"))  #Reduced bigrams to 43490

#Tally how many times each word appears across all coverage
words_count <- join_words %>%
  group_by(word) %>%
  count() %>%
  arrange(desc(n)) %>%
  print()
## # A tibble: 15,049 × 2
## # Groups:   word [15,049]
##    word          n
##    <chr>     <int>
##  1 climate    1323
##  2 people      749
##  3 change      675
##  4 water       650
##  5 gas         436
##  6 hide        372
##  7 caption     371
##  8 world       353
##  9 emissions   334
## 10 u.s         331
## # … with 15,039 more rows
#Tally each word pair: total appearances plus the number of distinct stories
#it appears in (28726 unique word pairs)
bigrams_count <- join_bigrams %>%
  group_by(bigram) %>%
  summarize(count = n(),
            stories = n_distinct(url)) %>%
  arrange(desc(count)) %>%
  #Manually remove frequent word pairs that are inherent to NPR (reporter names, npr processes, etc)
  filter(!bigram %in% c("hide caption", "desk climate", "digital stub",
                        "ramirez franco", "atc digital", "digital stories",
                        "tbd digital", "climate desk", "digital story",
                        "lauren sommer", "stories coming", "michael copley",
                        "margaret cirino", "nate rott", "npr hide",
                        "getty images", "images hide", "ap hide", "npr news",
                        "ryan kellman", "kellman npr", "apple podcasts",
                        "npr's climate")) %>%
  print()
## # A tibble: 28,726 × 3
##    bigram           count stories
##    <chr>            <int>   <int>
##  1 climate change     574     172
##  2 greenhouse gas     105      53
##  3 natural gas         82      22
##  4 gas emissions       78      44
##  5 fossil fuels        75      41
##  6 renewable energy    69      33
##  7 global warming      68      43
##  8 united nations      59      39
##  9 colorado river      53      10
## 10 fossil fuel         52      29
## # … with 28,716 more rows

Visualization

Total word frequency

This code pulls out top 25 bigrams by the total count of appearances across all stories published in this six-month period.

#Top 25 bigrams by total appearances across all stories
q <- bigrams_count %>%
  head(25)

#Horizontal bar chart of total mentions, annotated with per-story counts
graph <- q %>%
  ggplot(aes(x = fct_reorder(bigram, count), y = count)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  geom_text(aes(label = stories), vjust = 0.45, hjust = 1.2,
            colour = "gold", size = 3) +
  labs(title = "Top 25 word pairs from NPR's climate coverage (10/22-04/23)",
       x = "Word Pairs",
       y = "Number of times mentioned in NPR's climate coverage",
       subtitle = "Yellow figures show count of stories mentioning topic") +
  scale_y_continuous(breaks = seq(0, 600, by = 100),
                     minor_breaks = seq(0, 600, by = 25)) +
  theme_linedraw() +
  theme(plot.subtitle = element_text(face = "italic", size = 9))

print(graph)

ggsave(plot = graph,
       file = "outputs/Word pairs1.png")
## Saving 7 x 5 in image

Story frequency

The phrase “responsible solar” was included 37 times in just one story published Feb. 18, 2023, and the phrase “heat pumps” was mentioned three dozen times in two stories at the end of March, so these are not truly frequently covered topics.

Instead, this code charts the number of unique articles in which each topic appeared on NPR.org.

#Re-rank by the number of distinct stories each pair appears in
q2 <- bigrams_count %>%
  arrange(desc(stories)) %>%
  head(25)

#Bar chart of story counts for the 25 most widely covered pairs
graph2 <- q2 %>%
  ggplot(aes(x = fct_reorder(bigram, stories), y = stories)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Top 25 word pairs from NPR's climate coverage (10/22-04/23)",
       x = "Word Pairs",
       y = "Number of stories mentioning term in NPR's climate coverage",
       subtitle = "Evaluated from 272 articles pulled from npr.org/sections/climate/archive") +
  scale_y_continuous(breaks = seq(0, 175, by = 20)) +
  theme_linedraw() +
  theme(plot.subtitle = element_text(face = "italic", size = 9))

print(graph2)

ggsave(plot = graph2,
       file = "outputs/Word pairs2.png")
## Saving 7 x 5 in image

Checking phrases

Health

Question: How many stories mentioned “health”?
Answer: Only two stories mention “mental health” or “health concerns.”

#Chart story counts for every bigram containing "health"
topic1 <- bigrams_count %>%
  filter(str_detect(bigram, "health")) %>%
  slice_max(stories, n = 25) %>%
  head(25) %>%
  ggplot(aes(x = fct_reorder(bigram, stories), y = stories)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous(breaks = seq(1, 13, by = 1)) +
  labs(title = "\"Health\" within NPR's climate coverage (10/22-04/23)",
       x = "Word Pairs", y = "Count of stories") +
  theme_linedraw()

print(topic1)

ggsave(plot = topic1,
       file = "outputs/Word pairs-health.png")
## Saving 7 x 5 in image

Which stories reference the most frequent health topic?

#Find the single most frequent health-related bigram.
#with_ties = FALSE guarantees exactly one term even if several bigrams tie
#for the top story count — a multi-element result would otherwise be passed
#as the pattern to str_detect() below and error.
search_term <- bigrams_count %>%
  filter(str_detect(bigram, "health")) %>%
  slice_max(stories, n = 1, with_ties = FALSE) %>%
  pull(bigram)

#List the distinct stories mentioning that term
join_bigrams %>%
  filter(str_detect(bigram, search_term)) %>%
  distinct(bigram, date, headline, url) %>%
  print()
## # A tibble: 12 × 4
##    bigram        date       headline                                       url  
##    <chr>         <date>     <chr>                                          <chr>
##  1 public health 2023-04-04 There are plenty of doomsday climate stories … http…
##  2 public health 2023-03-31 There's a second outbreak of Marburg virus in… http…
##  3 public health 2023-02-04 Gas stove makers have a pollution solution. T… http…
##  4 public health 2023-01-31 Cleaner, healthier gas burners were developed… http…
##  5 public health 2023-01-21 Gas stoves became part of the culture war in … http…
##  6 public health 2023-01-06 How the EPA is cracking down with tighter lim… http…
##  7 public health 2022-12-28 Morning news brief                             http…
##  8 public health 2022-12-22 This is what's at risk from climate change in… http…
##  9 public health 2022-11-16 California unveils a plan to zero out greenho… http…
## 10 public health 2022-11-15 World leaders discuss high food and energy co… http…
## 11 public health 2022-11-15 The world population reaches 8 billion, posin… http…
## 12 public health 2022-11-14 FDA gives safety nod to 'no kill' meat, bring… http…

Native American

Question: How many stories mentioned “Native”, “indigenous”, or “tribal”?
Answer: A half-dozen mention indigenous people.

#Chart story counts for bigrams touching Native American, indigenous or tribal issues
# NOTE(review): alternation binds tighter than \b, so this pattern is
# (\bnative)|(indigenous)|(tribal\b) — "indigenous" matches anywhere inside a
# word. Confirm that's intended; \b(native|indigenous|tribal)\b would match
# whole words only.
topic2 <- bigrams_count %>%
  filter(str_detect(bigram, "\\bnative|indigenous|tribal\\b")) %>%
  #Drop plant/wildlife pairs unrelated to Native American coverage
  filter(!bigram %in% c("native species", "native plants")) %>%
  slice_max(stories, n = 25) %>%
  head(25) %>%
  ggplot(aes(x = fct_reorder(bigram, stories), y = stories)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous(breaks = seq(1, 13, by = 1)) +
  labs(title = "\"Native\" within NPR's climate coverage (10/22-04/23)",
       x = "Word Pairs", y = "Count of stories") +
  theme_linedraw()

print(topic2)

ggsave(plot = topic2,
       file = "outputs/Word pairs-native.png")
## Saving 7 x 5 in image

Which stories reference the most frequent topic connected to Native Americans, indigenous or tribal issues?

#Find the single most frequent Native/indigenous/tribal bigram.
#with_ties = FALSE guarantees exactly one term even if several bigrams tie
#for the top story count — a multi-element result would otherwise be passed
#as the pattern to str_detect() below and error.
search_term <- bigrams_count %>%
  filter(str_detect(bigram, "\\bnative|indigenous|tribal\\b")) %>%
  slice_max(stories, n = 1, with_ties = FALSE) %>%
  pull(bigram)

#List distinct stories mentioning that term (a singular search term also
#matches its plural, e.g. "indigenous people" matches "indigenous peoples")
join_bigrams %>%
  filter(str_detect(bigram, search_term)) %>%
  distinct(bigram, date, headline, url) %>%
  print()
## # A tibble: 10 × 4
##    bigram             date       headline                                  url  
##    <chr>              <date>     <chr>                                     <chr>
##  1 indigenous people  2023-04-02 Why deforestation means less rain in tro… http…
##  2 indigenous people  2023-03-31 From 4chan to international politics, a … http…
##  3 indigenous people  2023-03-20 The message from a U.N. climate report i… http…
##  4 indigenous peoples 2023-03-07 How to save a slow growing tree species   http…
##  5 indigenous peoples 2022-12-22 This is what's at risk from climate chan… http…
##  6 indigenous people  2022-12-22 This is what's at risk from climate chan… http…
##  7 indigenous peoples 2022-12-19 Negotiators at a U.N. biodiversity confe… http…
##  8 indigenous people  2022-12-07 A U.N. biodiversity convention aims to s… http…
##  9 indigenous peoples 2022-12-07 A U.N. biodiversity convention aims to s… http…
## 10 indigenous people  2022-12-06 Delegates meet with a mandate to set glo… http…

Flaring

Question: How many stories mentioned “flare” or “flaring”?
Answer: Very few stories. Just three separate stories had any mention of these terms, and one was AP copy.

#Very few stories mention flaring
topic3 <- bigrams_count %>%
  filter(str_detect(bigram, "flare|flaring")) %>%
  #Drop false positives unrelated to gas flaring
  filter(!bigram %in% c("flare melilla", "eyes flare")) %>%
  slice_max(stories, n = 25) %>%
  head(25) %>%
  ggplot(aes(x = fct_reorder(bigram, stories), y = stories)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  scale_y_continuous(breaks = seq(1, 13, by = 1)) +
  labs(title = "\"Flaring\" within NPR's climate coverage (10/22-04/23)",
       x = "Word Pairs", y = "Count of stories") +
  theme_linedraw()

print(topic3)

ggsave(plot = topic3,
       file = "outputs/Word pairs-flaring.png")
## Saving 7 x 5 in image

Which stories reference flaring?

#List every distinct story containing a flare/flaring bigram
join_bigrams %>%
  filter(str_detect(bigram, "flare|flaring")) %>%
  filter(!bigram %in% c("flare melilla", "eyes flare")) %>%
  distinct(bigram, date, headline, url) %>%
  print()
## # A tibble: 5 × 4
##   bigram           date       headline                                     url  
##   <chr>            <date>     <chr>                                        <chr>
## 1 methane flare    2022-11-17 Climate talks are wrapping up. The thornies… http…
## 2 routine flaring  2022-11-11 Here's what happened on Friday at the U.N.'… http…
## 3 minimize flaring 2022-11-11 Here's what happened on Friday at the U.N.'… http…
## 4 flaring methane  2022-11-11 Here's what happened on Friday at the U.N.'… http…
## 5 flare burning    2022-10-26 Greenhouse gases reach a new record as nati… http…

Chart story frequency over time

#How many stories was each term mentioned in, month by month?
search_terms <- bigrams_count %>%
  pull(bigram)

#Accumulate one tibble per term in a preallocated list — the original grew
#`analysis` with bind_rows() inside the loop, which is quadratic, and
#indexed with 1:length(), which misbehaves on empty input.
pieces <- vector("list", length(search_terms))
for (i in seq_along(search_terms)) {
  search <- search_terms[i]
  pieces[[i]] <- join_bigrams %>%
    filter(bigram == search) %>%
    distinct(headline, date, url) %>%
    #Bucket stories by publication month
    mutate(time_floor = floor_date(date, unit = "1 month")) %>%
    count(time_floor) %>%
    #adorn_totals() appends a "Total" row (coercing time_floor to character)
    adorn_totals() %>%
    mutate(bigram = search)
}
analysis <- bind_rows(pieces)

#Reshape to one row per bigram, with monthly story counts as columns
analysis2 <- analysis %>%
  pivot_wider(names_from = "time_floor",
              values_from = "n",
              values_fill = 0)

#Cache the table to disk and reload it
write_csv(analysis2, "bigrams_story_count.csv")
analysis2 <- read_csv("bigrams_story_count.csv")

#Double check story count performed above
#analysis2 %>%
#  slice_max(n=25, Total) %>%
#  ggplot(aes(x=fct_reorder(bigram, Total), y=Total)) +
#  geom_bar(stat="identity") + coord_flip() + 
#  labs(title="Top word pairs in NPR's climate coverage (10/22-04/23)", x="Word Pairs", y="Count of stories") %>%
#  print

#Pick the five bigrams that appear in the most stories overall
search_terms <- analysis2 %>%
  slice_max(Total, n = 5) %>%
  pull(bigram)

#Chart top 5 topics' frequency over time.
#The original stacked two geom_line() layers (a default-width line plus a
#1.3-width line), overplotting every series; a single layer suffices.
#NOTE(review): dropping `2023-04-01` and pivoting cols 2:7 hard-codes the
#Oct 2022-Mar 2023 window — update both if the date range changes.
graph4 <- analysis2 %>%
  filter(bigram %in% search_terms) %>%
  select(-c(Total, `2023-04-01`)) %>%
  pivot_longer(cols = (2:7),
               names_to = "date",
               values_to = "value") %>%
  mutate(date = ymd(date)) %>%
  ggplot(aes(x = date, y = value, group = bigram, color = bigram)) +
  geom_line(size = 1.3) +
  scale_x_date(date_breaks = "months", date_labels = "%b '%y") +
  labs(title = "Most frequent topics within NPR's climate coverage",
       x = NULL,
       y = "Count of stories",
       color = "Topic") +
  theme_linedraw()

print(graph4)

ggsave(plot = graph4,
       file = "outputs/Word pairs-over time.png")